Overview of Machine Learning

from dataidea.packages import * # imports np, pd, plt etc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from dataidea.datasets import loadDataset
demo_df = loadDataset('demo')
# Map the dataset's original column headers to lowercase, underscore-separated
# names so they are easy to type and usable as attributes (demo_df.age).
cols = {
    "Age": "age",
    "Gender": "gender",
    "Marital Status": "marital_status",
    "Address": "address",
    "Income": "income",
    "Income Category": "income_category",
    "Job Category": "job_category",
}

# Rename in place: demo_df itself is modified, nothing is returned.
demo_df.rename(columns=cols, inplace=True)
demo_df.columns
Index(['age', 'gender', 'marital_status', 'address', 'income',
       'income_category', 'job_category'],
      dtype='object')
demo_df.describe() #will only give us numerical values
age address income income_category job_category
count 200.000000 200.000000 200.000000 200.000000 200.000000
mean 42.475000 11.485000 76.305000 2.520000 1.950000
std 12.801122 10.365665 107.554647 1.065493 0.781379
min 19.000000 0.000000 11.000000 1.000000 1.000000
25% 32.000000 3.000000 27.000000 2.000000 1.000000
50% 43.000000 9.000000 44.500000 2.000000 2.000000
75% 51.000000 17.000000 76.000000 4.000000 3.000000
max 76.000000 51.000000 873.000000 4.000000 3.000000
demo_df.select_dtypes(include=["object"])
gender marital_status
0 f 1
1 m 0
2 f no answer
3 m 1
4 m no answer
... ... ...
195 f 0
196 f 1
197 f 1
198 m 0
199 m 0

200 rows × 2 columns

demo_df.select_dtypes(include=["object"]).describe()
gender marital_status
count 200 200
unique 4 3
top f 0
freq 99 102
demo_df["gender"].value_counts().index
Index(['f', 'm', '  f', '   m'], dtype='object', name='gender')
demo_df.gender.unique()
array(['f', 'm', '  f', '   m'], dtype=object)
# Normalise the padded value "  f" (two leading spaces) to "f". DataFrame.replace
# matches whole cell values in every column and returns a new frame, so demo_df
# is left untouched. The padded "   m" value is handled in a later step.
demo_df2 = demo_df.replace(to_replace="  f", value="f")
demo_df2.gender.unique()
array(['f', 'm', '   m'], dtype=object)
# Normalise the remaining padded value "   m" (three leading spaces) to "m".
# This works on the gender column only and returns a new Series; demo_df2 is
# not changed until the result is assigned back to it.
gender_col = demo_df2.gender.replace(to_replace="   m", value="m")
gender_col
0      f
1      m
2      f
3      m
4      m
      ..
195    f
196    f
197    f
198    m
199    m
Name: gender, Length: 200, dtype: object
gender_col.unique()
array(['f', 'm'], dtype=object)
demo_df2["gender"] = gender_col
demo_df2.gender.unique()
array(['f', 'm'], dtype=object)
demo_df2.marital_status.unique()
array(['1', '0', 'no answer'], dtype=object)
demo_df2.marital_status.value_counts()
marital_status
0            102
1             93
no answer      5
Name: count, dtype: int64
demo_df2.select_dtypes(include=["number"]) #"float64","int64"
age address income income_category job_category
0 55 12 72.0 3.0 3
1 56 29 153.0 4.0 3
2 28 9 28.0 2.0 1
3 24 4 26.0 2.0 1
4 25 2 23.0 1.0 2
... ... ... ... ... ...
195 45 3 86.0 4.0 3
196 23 2 27.0 2.0 1
197 66 32 11.0 1.0 2
198 49 4 30.0 2.0 1
199 45 1 147.0 4.0 3

200 rows × 5 columns

demo_df2.isna().sum()
age                0
gender             0
marital_status     0
address            0
income             0
income_category    0
job_category       0
dtype: int64
plt.boxplot(demo_df2["income"])
{'whiskers': [<matplotlib.lines.Line2D>,
  <matplotlib.lines.Line2D>],
 'caps': [<matplotlib.lines.Line2D>,
  <matplotlib.lines.Line2D>],
 'boxes': [<matplotlib.lines.Line2D>],
 'medians': [<matplotlib.lines.Line2D>],
 'fliers': [<matplotlib.lines.Line2D>],
 'means': []}

# Exercise: write a function to calculate outliers.
# lower fence = Q1 - 1.5 * (Q3 - Q1)
# upper fence = Q3 + 1.5 * (Q3 - Q1)
def getOutliers(column):
    """Find the outliers of a numeric column with the 1.5 * IQR rule.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column : array-like
        One-dimensional numeric data (pandas Series, NumPy array, list, ...).

    Returns
    -------
    outliers : numpy.ndarray
        The outlying values, in the order they appear in ``column``.
    outlier_indicies : numpy.ndarray
        Zero-based *positions* of those values in ``column``. These are
        positions, not pandas index labels; use ``series.index[positions]``
        to obtain labels when the index is not the default 0..n-1 range.
    """
    # Work on a plain array so that the positions produced by np.where are
    # used positionally. Indexing a Series with them would be a label lookup,
    # which fails (or picks the wrong rows) when the index is not 0..n-1.
    values = np.asarray(column)
    if values.size == 0:
        # Nothing to inspect; quantiles of an empty array are undefined.
        return values, np.empty(0, dtype=np.intp)

    q1 = np.quantile(values, 0.25)
    q3 = np.quantile(values, 0.75)
    interquantile_range = q3 - q1
    lower_fence = q1 - 1.5 * interquantile_range
    upper_fence = q3 + 1.5 * interquantile_range

    is_outlier = (values < lower_fence) | (values > upper_fence)
    outlier_indicies = np.where(is_outlier)[0]
    outliers = values[outlier_indicies]
    return outliers, outlier_indicies
# getOutliers reports zero-based row positions, while DataFrame.drop removes
# rows by index label. Translate positions to labels first so the right rows
# are dropped even if demo_df2 does not have the default 0..n-1 index.
outliers, indexes = getOutliers(demo_df2.income)
demo_df3 = demo_df2.drop(index=demo_df2.index[indexes])
plt.hist(demo_df2.age, bins = 20, edgecolor = "black")
(array([ 7., 12., 11., 17., 11., 10., 10., 17., 14., 18., 14., 17., 14.,
         6.,  7.,  6.,  5.,  2.,  1.,  1.]),
 array([19.  , 21.85, 24.7 , 27.55, 30.4 , 33.25, 36.1 , 38.95, 41.8 ,
        44.65, 47.5 , 50.35, 53.2 , 56.05, 58.9 , 61.75, 64.6 , 67.45,
        70.3 , 73.15, 76.  ]),
 <BarContainer object of 20 artists>)

plt.hist(demo_df3.income, bins=20, edgecolor="black")
(array([11., 24., 37., 12., 18., 18.,  8.,  9.,  9.,  9.,  6.,  2.,  2.,
         2.,  4.,  2.,  0.,  1.,  2.,  5.]),
 array([ 11.  ,  17.85,  24.7 ,  31.55,  38.4 ,  45.25,  52.1 ,  58.95,
         65.8 ,  72.65,  79.5 ,  86.35,  93.2 , 100.05, 106.9 , 113.75,
        120.6 , 127.45, 134.3 , 141.15, 148.  ]),
 <BarContainer object of 20 artists>)

plt.scatter(demo_df2.age, demo_df2.income)
plt.show()

plt.scatter(demo_df3.age, demo_df3.income)
plt.show()

# Keep only the rows with an income below 600. The mask is built from the
# frame being filtered, so it always lines up with that frame's rows.
demo_df2 = demo_df2[demo_df2.income < 600]
demo_df2.isna().sum()
age                0
gender             0
marital_status     0
address            0
income             0
income_category    0
job_category       0
dtype: int64
demo_df2.head()
age gender marital_status address income income_category job_category
0 55 f 1 12 72.0 3.0 3
1 56 m 0 29 153.0 4.0 3
2 28 f no answer 9 28.0 2.0 1
3 24 m 1 4 26.0 2.0 1
4 25 m no answer 2 23.0 1.0 2
# Drop the respondents whose marital status is 'no answer'. The .copy() makes
# demo_df4 an independent frame, so later column assignments on it do not
# touch demo_df2.
demo_df4 = demo_df2.loc[demo_df2['marital_status'] != 'no answer'].copy()
# Save the cleaned rows; index=False keeps the row index out of the file.
demo_df4.to_csv('../assets/demo_cleaned.csv', index=False)
demo_df4.sample(n=5)
age gender marital_status address income income_category job_category
119 53 f 0 34 136.0 4.0 3
6 44 m 1 17 144.0 4.0 3
80 38 m 0 7 42.0 2.0 1
76 19 f 1 0 13.0 1.0 1
59 28 m 0 9 28.0 2.0 2
demo_df4.info()
<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, 0 to 199
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              193 non-null    int64  
 1   gender           193 non-null    object 
 2   marital_status   193 non-null    object 
 3   address          193 non-null    int64  
 4   income           193 non-null    float64
 5   income_category  193 non-null    float64
 6   job_category     193 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 12.1+ KB
# marital_status still holds the strings '0' and '1'; cast it to integers so
# it can serve as a numeric target.
demo_df4['marital_status'] = demo_df4['marital_status'].astype('int')
# One-hot encode gender on a copy of the frame. drop_first=True drops the
# first level, leaving a single 0/1 indicator column (gender_m).
demo_df5 = pd.get_dummies(
    data=demo_df4.copy(),
    columns=['gender'],
    drop_first=True,
    dtype='int',
)
demo_df5.sample(n=5)
age marital_status address income income_category job_category gender_m
51 48 0 22 109.0 4.0 2 1
183 38 1 18 77.0 4.0 3 0
85 30 0 4 23.0 1.0 1 0
17 21 0 1 37.0 2.0 1 1
156 43 1 5 144.0 4.0 3 1
# Target is marital_status; every other column is a feature.
y = demo_df5['marital_status']
X = demo_df5.drop(columns='marital_status')
# Hold out 25% of the rows for testing. No random_state is given, so the
# split (and every score computed from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# First fit on the complete data set; the model is refitted on the training
# split further down.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X, y)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
logistic_regression_model.score(X, y) * 100
54.40414507772021
logistic_regression_model.fit(X_train, y_train)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
logistic_regression_model.score(X_test, y_test)
0.42857142857142855
demo_df2[demo_df2.marital_status == 'no answer']
age gender marital_status address income income_category job_category
2 28 f no answer 9 28.0 2.0 1
4 25 m no answer 2 23.0 1.0 2
7 46 m no answer 20 75.0 4.0 3
8 41 m no answer 10 26.0 2.0 2
9 29 f no answer 4 19.0 1.0 2
# Predict for one hand-written row, given in X's column order (age, address,
# income, income_category, job_category, gender_m). The model was fitted on a
# DataFrame with column names and a bare nested list has none, which is what
# triggers the UserWarning shown below.
logistic_regression_model.predict([[28, 9, 28, 2, 1, 0]])
/home/jumashafara/venvs/programming_for_data_science/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
array([0])
predictions = logistic_regression_model.predict(X_test)
# X_test['predicted_marial_status'] = predictions
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, y_train)
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
decision_tree_classifier.score(X_test, y_test)
0.4897959183673469
# Same hand-written row as used with the logistic regression model, in X's
# column order. Passing a bare nested list (no column names) to a model fitted
# on a DataFrame is what triggers the UserWarning shown below.
decision_tree_classifier.predict(X=[[28, 9, 28, 2, 1, 0]])
/home/jumashafara/venvs/programming_for_data_science/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
array([0])
decision_tree_classifier.predict(X=X_test)
array([1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1])
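# Exercise: compute the accuracy score by hand.
# 1. Take X_test and y_test.
# 2. Make predictions on X_test.
# 3. Compare the predictions with the true values, i.e. y_test.
# 4. Count how many predictions are correct.
# 5. accuracy = correct / total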
Back to top